# Load the CSV reader and plotting packages.
library(readr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Hourly Capital Bikeshare ridership data.  The file has an unnamed
# row-index column, which read_csv auto-names "X1" (see warning below).
day <- read_csv("cabi.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## season = col_integer(),
## year = col_integer(),
## month = col_integer(),
## wday = col_integer(),
## hr = col_integer(),
## temp = col_double(),
## atemp = col_double(),
## hum = col_double(),
## windspeed = col_double(),
## weather = col_integer(),
## casual = col_integer(),
## registered = col_integer()
## )
# Drop the redundant row-index column.
day$X1 <- NULL
# a
# Boxplots of casual and registered ridership by year.  "year" is coded
# as two integer levels, so the discrete axis is relabelled 2011/2012.
# Fixed typo in the first title: "Causal" -> "Casual".
ggplot(day, aes(x = as.factor(year), y = casual)) + geom_boxplot() +
labs(title="Boxplot of Casual Riders",
subtitle="Variation by Year",
x = "Year",
y = "Number of Riders") + scale_x_discrete(labels=c("2011", "2012"))
ggplot(day, aes(x = as.factor(year), y = registered)) + geom_boxplot() +
labs(title="Boxplot of Registered Riders",
subtitle="Variation by Year",
x = "Year",
y = "Number of Riders") + scale_x_discrete(labels=c("2011", "2012"))
# Boxplots of casual and registered ridership by month.
# month.abb is R's built-in vector c("Jan", ..., "Dec"), identical to the
# hand-written label vector it replaces.
ggplot(day, aes(x = as.factor(month), y = casual)) +
  geom_boxplot() +
  scale_x_discrete(labels = month.abb) +
  labs(title = "Casual Riders",
       subtitle = "Variation by Month",
       x = "Month",
       y = "Number of Riders")
ggplot(day, aes(x = as.factor(month), y = registered)) +
  geom_boxplot() +
  scale_x_discrete(labels = month.abb) +
  labs(title = "Registered Riders",
       subtitle = "Variation by Month",
       x = "Month",
       y = "Number of Riders")
# Total riders by hour of day.  hr is 0-23 in the data and is shifted to
# 1-24 for display.  stat = "identity" sums the counts within each hour,
# so bar heights are totals over the whole period, not averages.
ggplot(day, aes(x = (hr + 1), y = casual)) + geom_bar(stat = "identity") +
labs(title="Casual Riders",
subtitle="Variation by Hour of Day",
x = "Hour",
y = "Number of Riders") +
# NOTE(review): limits = c(1, 24) clip the halves of the bars centred at
# exactly 1 and 24 (ggplot drops data outside continuous limits) - confirm
# the intended axis range.
scale_x_continuous(limits=c(1, 24)) +
scale_y_continuous(limits=c(0, 300000))
ggplot(day, aes(x = (hr + 1), y = registered)) + geom_bar(stat = "identity") +
labs(title="Registered Riders",
subtitle="Variation by Hour of Day",
x = "Hour",
y = "Number of Riders") +
scale_x_continuous(limits=c(1, 24)) +
scale_y_continuous(limits=c(0, 300000))
# b
# Casual vs registered counts, coloured by year, with one smooth curve
# per group (geom_smooth's default picks GAM here; see output below).
# geom_jitter reduces overplotting of integer counts.
ggplot(day, aes(x = casual, y = registered)) + geom_jitter(aes(colour = as.factor(year))) +
geom_smooth(aes(colour = as.factor(year))) + scale_color_manual(labels = c("2011", "2012"), values = c("blue", "red")) + scale_x_continuous(limits = c(0,1000)) + scale_y_continuous(limits = c(0,1000))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 791 rows containing missing values (geom_point).
# Same scatter, coloured by wday.
# NOTE(review): only two labels/colours are supplied, so this assumes wday
# is a 0/1 working-day flag; if wday is day-of-week (0-6) the legend is
# wrong - verify against the data dictionary.
ggplot(day, aes(x = casual, y = registered)) + geom_jitter(aes(colour = as.factor(wday))) +
geom_smooth(aes(colour = as.factor(wday))) + scale_color_manual(labels = c("Not a working day", "Working Day"), values = c("blue", "red")) + scale_x_continuous(limits = c(0,1000)) + scale_y_continuous(limits = c(0,1000))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 789 rows containing missing values (geom_point).
# c
# Total rider counts per weather category.  stat = "identity" with
# repeated x values sums the hourly counts, so these bars are totals,
# not means: a category observed in more hours will be taller for that
# reason alone.
ggplot(day, aes(x = as.factor(weather), y = casual)) + geom_bar(stat = "identity") +
labs(title="Casual Riders",
subtitle="Variation by Weather",
x = "Weather Type",
y = "Number of Riders") +
scale_x_discrete(labels=c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"))
ggplot(day, aes(x = as.factor(weather), y = registered)) + geom_bar(stat = "identity") +
labs(title="Registered Riders",
subtitle="Variation by Weather",
x = "Weather Type",
y = "Number of Riders") +
scale_x_discrete(labels=c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"))
# d
# Stacked bars: total riders per hour of day, segmented by weather type
# via fill.  hr is shifted from 0-23 to 1-24 for display.
ggplot(day, aes(x = (hr + 1), y = casual, fill = as.factor(weather))) +
geom_bar(stat = "identity") + scale_fill_manual(labels = c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"), values = c("#42a7f4", "red", "green")) + labs(title="Casual Riders",
subtitle="Variation by Time of Day and Weather Type",
x = "Hour of Day",
y = "Number of Riders")
ggplot(day, aes(x = (hr + 1), y = registered, fill = as.factor(weather))) + geom_bar(stat = "identity") + scale_fill_manual(labels = c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"), values = c("#42a7f4", "red", "green")) + labs(title="Registered Riders",
subtitle="Variation by Time of Day and Weather Type",
x = "Hour of Day",
y = "Number of Riders")
# Stacked bars: total riders per month, segmented by weather type.
# Fixed copy-paste error: subtitles said "Time of Day" although the x
# axis is month; also made the x mapping explicit in the second call.
ggplot(day, aes(x = as.factor(month), y = casual, fill = as.factor(weather))) +
geom_bar(stat = "identity") + scale_fill_manual(labels = c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"), values = c("#42a7f4", "red", "green")) + labs(title="Casual Riders",
subtitle="Variation by Month and Weather Type",
x = "Month",
y = "Number of Riders")
ggplot(day, aes(x = as.factor(month), y = registered, fill = as.factor(weather))) + geom_bar(stat = "identity") + scale_fill_manual(labels = c("Clear", "Cloudy/Fog", "Rain/Snow/Thunder"), values = c("#42a7f4", "red", "green")) + labs(title="Registered Riders",
subtitle="Variation by Month and Weather Type",
x = "Month",
y = "Number of Riders")
(a). By comparing the boxplots we can see the number of casual riders (e.g. tourists) increased from 2011 to 2012, although the median value for both years was not hugely different. However, 2012 had more outlier days with a high number of riders.
The number of registered riders increased more significantly, implying that more people began to use the Capital Bikeshare service to commute around the city in place of public transport/private vehicles.
For both casual and registered riders there are a greater number of rides during the summer months, which makes sense as it is more comfortable to bike around in the city as compared to winter. The number of registered users is higher in winter as compared to casual riders, though this might just be because fewer tourists come to DC in the cold.
Finally we see that registered riders usually peak around office commuting hours (e.g. before 9am and after 5pm) which shows that a lot of these users are going to and from work. On the other hand for casual riders the peak is around late afternoon/evening when most tourists would be sightseeing etc.
(b). There seems to be a positive relationship between registered users and casual users, implying that other factors, such as weather, seem to affect both similarly. On a clear day more riders will be using the service, both casual and registered. The effect does not vary much by year, as both curves follow a similar trajectory.
However the effect does vary by working day. In the case of a working day we see there tend to be more registered users, which shows that these people need to get to work, whereas for casual users the increase is much gentler, implying less urgency.
(c). There is a strong relationship between the weather situation and ridership counts. Biking seems to be an activity strongly associated with the weather conditions as many individuals choose not to bike when it is rainy or snowing. This holds true for both casual and registered riders. There is a steep drop in ridership from clear to cloudy, and there are hardly any riders during the snow/rain.
(d). We notice that ridership is almost zero if the weather is snowy/raining in the morning or late at night, for both casual and registered riders. However there are a few riders during the middle of the day, although the number is larger for registered riders.
There is a slightly larger number of riders in the summer months for rainy/snowy weather as compared to the winter.
# 70/30 train/test split of the hourly observations.
# set.seed makes the split - and therefore every model fit and test MSE
# reported below - reproducible; without it each knit draws a different
# split.  floor() makes the truncation of the non-integer sample size
# explicit instead of relying on silent coercion inside sample().
set.seed(1)
train1 <- sample(nrow(day), floor(nrow(day) * 0.70), replace = FALSE)
day_train <- day[train1,]
day_test <- day[-train1,]
# a
# Baseline multiple regression for registered riders: season and weather
# enter as factors, everything else (including month and hr) as numeric.
mlr_reg <- lm(registered ~ factor(season) + year + month + wday + hr + temp + atemp + hum + windspeed +
factor(weather), data = day_train)
summary(mlr_reg)
##
## Call:
## lm(formula = registered ~ factor(season) + year + month + wday +
## hr + temp + atemp + hum + windspeed + factor(weather), data = day_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -287.68 -77.97 -25.75 46.67 616.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.433e+05 4.523e+03 -31.693 < 2e-16 ***
## factor(season)2 1.568e+01 4.093e+00 3.830 0.000129 ***
## factor(season)3 6.751e+00 5.809e+00 1.162 0.245210
## factor(season)4 5.276e+01 5.764e+00 9.153 < 2e-16 ***
## year 7.125e+01 2.248e+00 31.693 < 2e-16 ***
## month 4.720e-01 6.083e-01 0.776 0.437811
## wday 4.028e+01 2.403e+00 16.763 < 2e-16 ***
## hr 6.370e+00 1.733e-01 36.750 < 2e-16 ***
## temp 1.314e+02 3.905e+01 3.364 0.000770 ***
## atemp 1.008e+02 4.220e+01 2.388 0.016933 *
## hum -1.231e+02 7.211e+00 -17.064 < 2e-16 ***
## windspeed 3.289e+01 9.992e+00 3.292 0.000998 ***
## factor(weather)2 9.198e+00 2.727e+00 3.373 0.000745 ***
## factor(weather)3 -2.832e+01 4.604e+00 -6.151 7.92e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 123.1 on 12151 degrees of freedom
## Multiple R-squared: 0.3409, Adjusted R-squared: 0.3402
## F-statistic: 483.5 on 13 and 12151 DF, p-value: < 2.2e-16
# b
# Test-set error for the baseline model.
# predict(..., interval = "confidence") returns an n x 3 matrix with
# columns fit/lwr/upr; subtracting the observed values from the whole
# matrix would average the squared errors of the interval bounds into
# the MSE as well.  Restrict the computation to the "fit" column.
predicted_reg <- predict(mlr_reg, day_test, interval = "confidence")
mean((predicted_reg[, "fit"] - day_test$registered)^2)
## [1] 15008.7
# c
# Training MSE, for comparison with the test MSE above.
mean(mlr_reg$residuals^2)
## [1] 15128.27
# Refit with month as a 12-level factor (instead of a single linear
# term) so each month gets its own effect alongside season.
mlr_reg3 <- lm(registered ~ factor(season) + year + factor(month) + wday + hr + temp + atemp + hum + windspeed + factor(weather), data = day_train)
summary(mlr_reg3)
##
## Call:
## lm(formula = registered ~ factor(season) + year + factor(month) +
## wday + hr + temp + atemp + hum + windspeed + factor(weather),
## data = day_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -277.50 -77.45 -25.79 45.93 616.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.414e+05 4.517e+03 -31.304 < 2e-16 ***
## factor(season)2 3.700e+01 7.051e+00 5.248 1.57e-07 ***
## factor(season)3 2.421e+01 8.312e+00 2.913 0.003588 **
## factor(season)4 6.347e+01 7.085e+00 8.957 < 2e-16 ***
## year 7.029e+01 2.245e+00 31.306 < 2e-16 ***
## factor(month)2 -4.139e+00 5.651e+00 -0.732 0.463919
## factor(month)3 -1.368e+01 6.286e+00 -2.176 0.029576 *
## factor(month)4 -3.480e+01 9.317e+00 -3.735 0.000189 ***
## factor(month)5 -2.717e+01 9.931e+00 -2.736 0.006236 **
## factor(month)6 -4.663e+01 1.011e+01 -4.613 4.01e-06 ***
## factor(month)7 -5.298e+01 1.141e+01 -4.644 3.45e-06 ***
## factor(month)8 -2.569e+01 1.117e+01 -2.300 0.021477 *
## factor(month)9 -3.155e+00 1.001e+01 -0.315 0.752514
## factor(month)10 -1.539e+01 9.335e+00 -1.648 0.099348 .
## factor(month)11 -2.463e+01 8.998e+00 -2.737 0.006201 **
## factor(month)12 -5.527e+00 7.221e+00 -0.765 0.444007
## wday 3.992e+01 2.401e+00 16.627 < 2e-16 ***
## hr 6.186e+00 1.748e-01 35.391 < 2e-16 ***
## temp 1.503e+02 4.080e+01 3.683 0.000231 ***
## atemp 1.118e+02 4.263e+01 2.623 0.008724 **
## hum -1.313e+02 7.351e+00 -17.862 < 2e-16 ***
## windspeed 3.062e+01 1.005e+01 3.046 0.002328 **
## factor(weather)2 7.895e+00 2.730e+00 2.892 0.003835 **
## factor(weather)3 -2.813e+01 4.593e+00 -6.124 9.40e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 122.6 on 12141 degrees of freedom
## Multiple R-squared: 0.3465, Adjusted R-squared: 0.3452
## F-statistic: 279.8 on 23 and 12141 DF, p-value: < 2.2e-16
# d
# Copies of the split with hr pre-converted to a factor.
# NOTE(review): day_train2/day_test2 are never used in the code shown -
# the model below is fit on day_train with factor(hr) in the formula and
# predictions use day_test - so these four lines look like dead code.
# Confirm nothing later in the document uses them before removing.
day_train2 <- day_train
day_test2 <- day_test
day_train2$hr <- as.factor(day_train2$hr)
day_test2$hr <- as.factor(day_test2$hr)
# Same regression as mlr_reg but with hour as a 24-level factor instead
# of one numeric slope, allowing a separate effect for each hour.
mlr_reg2 <- lm(registered ~ factor(season) + year + month + wday + factor(hr) + temp + atemp + hum + windspeed + factor(weather), data = day_train)
summary(mlr_reg2)
##
## Call:
## lm(formula = registered ~ factor(season) + year + month + wday +
## factor(hr) + temp + atemp + hum + windspeed + factor(weather),
## data = day_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -343.96 -48.94 -6.37 44.44 440.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.478e+05 3.166e+03 -46.690 < 2e-16 ***
## factor(season)2 3.103e+01 2.898e+00 10.708 < 2e-16 ***
## factor(season)3 3.224e+01 4.134e+00 7.800 6.69e-15 ***
## factor(season)4 6.184e+01 4.032e+00 15.338 < 2e-16 ***
## year 7.347e+01 1.574e+00 46.679 < 2e-16 ***
## month -3.997e-01 4.253e-01 -0.940 0.347250
## wday 4.166e+01 1.679e+00 24.808 < 2e-16 ***
## factor(hr)1 -8.209e+00 5.397e+00 -1.521 0.128272
## factor(hr)2 -1.969e+01 5.474e+00 -3.598 0.000322 ***
## factor(hr)3 -2.720e+01 5.435e+00 -5.005 5.67e-07 ***
## factor(hr)4 -3.307e+01 5.390e+00 -6.135 8.76e-10 ***
## factor(hr)5 -1.603e+01 5.439e+00 -2.947 0.003213 **
## factor(hr)6 3.752e+01 5.424e+00 6.917 4.85e-12 ***
## factor(hr)7 1.696e+02 5.391e+00 31.457 < 2e-16 ***
## factor(hr)8 3.012e+02 5.370e+00 56.082 < 2e-16 ***
## factor(hr)9 1.451e+02 5.325e+00 27.253 < 2e-16 ***
## factor(hr)10 7.783e+01 5.422e+00 14.354 < 2e-16 ***
## factor(hr)11 9.655e+01 5.409e+00 17.852 < 2e-16 ***
## factor(hr)12 1.259e+02 5.536e+00 22.738 < 2e-16 ***
## factor(hr)13 1.216e+02 5.498e+00 22.123 < 2e-16 ***
## factor(hr)14 1.016e+02 5.443e+00 18.666 < 2e-16 ***
## factor(hr)15 1.144e+02 5.492e+00 20.832 < 2e-16 ***
## factor(hr)16 1.749e+02 5.483e+00 31.895 < 2e-16 ***
## factor(hr)17 3.293e+02 5.501e+00 59.871 < 2e-16 ***
## factor(hr)18 3.103e+02 5.464e+00 56.793 < 2e-16 ***
## factor(hr)19 2.104e+02 5.458e+00 38.543 < 2e-16 ***
## factor(hr)20 1.384e+02 5.383e+00 25.718 < 2e-16 ***
## factor(hr)21 9.823e+01 5.370e+00 18.293 < 2e-16 ***
## factor(hr)22 6.376e+01 5.363e+00 11.889 < 2e-16 ***
## factor(hr)23 3.285e+01 5.380e+00 6.105 1.06e-09 ***
## temp 9.284e+01 2.736e+01 3.394 0.000692 ***
## atemp 6.491e+01 2.949e+01 2.201 0.027766 *
## hum -4.561e+01 5.496e+00 -8.298 < 2e-16 ***
## windspeed -1.266e+01 7.016e+00 -1.805 0.071147 .
## factor(weather)2 -5.312e+00 1.936e+00 -2.744 0.006075 **
## factor(weather)3 -5.310e+01 3.275e+00 -16.214 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 85.96 on 12129 degrees of freedom
## Multiple R-squared: 0.679, Adjusted R-squared: 0.6781
## F-statistic: 733.1 on 35 and 12129 DF, p-value: < 2.2e-16
# Test MSE for the factor-hour model.
# predict(..., interval = "confidence") returns an n x 3 matrix
# (fit/lwr/upr); restrict the error computation to the "fit" column so
# the interval bounds are not folded into the MSE.
predicted_reg2 <- predict(mlr_reg2, day_test, interval = "confidence")
mean((predicted_reg2[, "fit"] - day_test$registered)^2)
## [1] 7442.71
(a). From our regression output we see that the variables season (for categories 2 and 4), year, wday, hr, temp and hum are highly significant (p-value much lower than 0.001). Both categories for the weather variable are also significant at the 1% level along with windspeed. The atemp variable is significant at the 5% level.
This significance level implies that all of these variables have a beta coefficient which is significantly different from zero and we can interpret the effect of these coefficients. For example, the “1.314e+02” coefficient on the temp variable implies that a 1 unit increase in temperature is associated with an increase of about 131 registered riders per hour holding all other variables constant.
We see that the the number of registered riders increases on a working day, if the year is 2012, if the temperature and windspeed increase. Similarly there is a negative relationship between registered riders and inreasing humidity, and if the weather is rainy or snowing.
These results make sense as registered riders are often those that use the bikes to commute to and from work, but might look at alternatives in case of adverse weather.
(b). The test MSE value is 15008.7.
(c).
(d). We see that the Adj R squared value almost doubles if we include the hour variable as a factor instead of as a numeric. Moreover, the test MSE value also falls, to 7442.71, which indicates the improvement holds not just for the training data but also for the test data.
This is because including the hr variable as numeric implies that there is an increasing relationship within the variable, with hr 23 being greater than hr 22, which is not accurate. For time periods, although one might occur later than another, there is no independent magnitude which would be greater for one category or another. This is why we should include the hr variable as a factor.
# NOTE(review): pROC is loaded here but no ROC computation appears in
# the code shown - confirm it is actually needed.
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# nnet provides single-hidden-layer feed-forward neural networks.
library(nnet)
# Sweep single-hidden-layer networks of size 2, 4, ..., 20 over the
# weather covariates and record the test MSE for each size.
# The response is scaled to roughly [0, 1] (registered / 1000) because
# nnet's default logistic output unit is bounded; predictions are
# rescaled back before computing the MSE.
# NOTE(review): nnet's weight initialisation is random and no seed is
# set, so these MSEs vary between knits.
# numeric(10) preallocates the result vector instead of growing it
# element by element inside the loop.
nnet_MSE <- numeric(10)
for (i in seq_len(10)) {
reg_nnet1 <- nnet(registered/1000 ~ temp + atemp + hum + windspeed + factor(weather), data = day_train, size = (2*i), maxit = 1000, decay = 0.1)
predict1_reg <- predict(reg_nnet1, day_test, type = "raw") * 1000
nnet_MSE[i] <- mean((predict1_reg - day_test$registered)^2)
}
## # weights: 17
## initial value 1675.853583
## iter 10 value 268.599038
## iter 20 value 231.213600
## iter 30 value 229.899702
## iter 40 value 229.784455
## iter 50 value 229.769561
## iter 60 value 229.763914
## iter 70 value 229.032319
## iter 80 value 228.721037
## final value 228.690161
## converged
## # weights: 33
## initial value 1795.626896
## iter 10 value 351.614877
## iter 20 value 237.027525
## iter 30 value 231.120209
## iter 40 value 230.515734
## iter 50 value 229.416492
## iter 60 value 228.789027
## iter 70 value 228.629060
## iter 80 value 228.537359
## iter 90 value 228.475042
## iter 100 value 228.455153
## iter 110 value 228.439874
## iter 120 value 228.436346
## iter 130 value 228.432900
## final value 228.432165
## converged
## # weights: 49
## initial value 2099.166275
## iter 10 value 407.358969
## iter 20 value 235.320337
## iter 30 value 230.689236
## iter 40 value 229.291453
## iter 50 value 228.655584
## iter 60 value 228.488306
## iter 70 value 228.458036
## iter 80 value 228.437318
## iter 90 value 228.434733
## iter 100 value 228.432588
## iter 110 value 228.432067
## iter 120 value 228.428298
## iter 130 value 228.419965
## iter 140 value 228.414935
## iter 150 value 228.413211
## iter 160 value 228.412610
## final value 228.412489
## converged
## # weights: 65
## initial value 2872.425309
## iter 10 value 409.754883
## iter 20 value 238.143274
## iter 30 value 231.361011
## iter 40 value 229.084033
## iter 50 value 228.672595
## iter 60 value 228.548998
## iter 70 value 228.508985
## iter 80 value 228.492641
## iter 90 value 228.475269
## iter 100 value 228.470576
## iter 110 value 228.468222
## iter 120 value 228.466297
## iter 130 value 228.464163
## iter 140 value 228.458402
## iter 150 value 228.446421
## iter 160 value 228.436628
## iter 170 value 228.433387
## final value 228.433185
## converged
## # weights: 81
## initial value 5310.234521
## iter 10 value 602.528340
## iter 20 value 249.478824
## iter 30 value 234.018369
## iter 40 value 230.473177
## iter 50 value 229.807436
## iter 60 value 229.232351
## iter 70 value 228.894520
## iter 80 value 228.724806
## iter 90 value 228.615850
## iter 100 value 228.573740
## iter 110 value 228.546755
## iter 120 value 228.517395
## iter 130 value 228.496754
## iter 140 value 228.482446
## iter 150 value 228.472914
## iter 160 value 228.467285
## iter 170 value 228.462101
## iter 180 value 228.455453
## iter 190 value 228.430348
## iter 200 value 228.421417
## iter 210 value 228.412573
## iter 220 value 228.407194
## iter 230 value 228.404946
## iter 240 value 228.403798
## final value 228.403776
## converged
## # weights: 97
## initial value 1831.686640
## iter 10 value 555.106860
## iter 20 value 279.233461
## iter 30 value 239.790060
## iter 40 value 233.660189
## iter 50 value 231.627258
## iter 60 value 230.720342
## iter 70 value 230.271586
## iter 80 value 229.801900
## iter 90 value 229.394380
## iter 100 value 229.088192
## iter 110 value 228.936604
## iter 120 value 228.818361
## iter 130 value 228.717980
## iter 140 value 228.591154
## iter 150 value 228.498441
## iter 160 value 228.452300
## iter 170 value 228.433040
## iter 180 value 228.423703
## iter 190 value 228.419080
## iter 200 value 228.414043
## iter 210 value 228.402723
## iter 220 value 228.391760
## iter 230 value 228.388673
## iter 240 value 228.387435
## iter 250 value 228.386966
## final value 228.386885
## converged
## # weights: 113
## initial value 969.872339
## iter 10 value 448.144864
## iter 20 value 237.243674
## iter 30 value 230.607019
## iter 40 value 229.278978
## iter 50 value 228.893954
## iter 60 value 228.686854
## iter 70 value 228.534759
## iter 80 value 228.479707
## iter 90 value 228.458600
## iter 100 value 228.437948
## iter 110 value 228.409073
## iter 120 value 228.399894
## iter 130 value 228.393517
## iter 140 value 228.389318
## iter 150 value 228.387697
## iter 160 value 228.386907
## iter 170 value 228.386387
## iter 180 value 228.386208
## iter 190 value 228.385853
## iter 200 value 228.385559
## iter 210 value 228.385272
## iter 220 value 228.385063
## iter 230 value 228.384928
## final value 228.384706
## converged
## # weights: 129
## initial value 624.675306
## iter 10 value 319.304215
## iter 20 value 232.442131
## iter 30 value 229.284033
## iter 40 value 228.612203
## iter 50 value 228.447207
## iter 60 value 228.418532
## iter 70 value 228.404752
## iter 80 value 228.393565
## iter 90 value 228.389538
## iter 100 value 228.385678
## iter 110 value 228.383310
## iter 120 value 228.382176
## iter 130 value 228.381412
## iter 140 value 228.380536
## iter 150 value 228.380042
## iter 160 value 228.379853
## iter 170 value 228.379718
## iter 180 value 228.379582
## iter 190 value 228.379428
## iter 200 value 228.379317
## iter 210 value 228.379141
## final value 228.379128
## converged
## # weights: 145
## initial value 5513.409489
## iter 10 value 598.136681
## iter 20 value 247.071094
## iter 30 value 233.232057
## iter 40 value 230.113529
## iter 50 value 229.279411
## iter 60 value 228.847895
## iter 70 value 228.666974
## iter 80 value 228.552384
## iter 90 value 228.453861
## iter 100 value 228.429803
## iter 110 value 228.412610
## iter 120 value 228.402055
## iter 130 value 228.395951
## iter 140 value 228.390529
## iter 150 value 228.388155
## iter 160 value 228.386200
## iter 170 value 228.384100
## iter 180 value 228.382972
## iter 190 value 228.381984
## iter 200 value 228.381189
## iter 210 value 228.380831
## iter 220 value 228.380349
## iter 230 value 228.379244
## iter 240 value 228.378608
## iter 250 value 228.378111
## iter 260 value 228.377504
## iter 270 value 228.376991
## iter 280 value 228.376721
## iter 290 value 228.376631
## iter 300 value 228.376529
## iter 310 value 228.376317
## iter 320 value 228.375820
## final value 228.375802
## converged
## # weights: 161
## initial value 2306.219567
## iter 10 value 598.910138
## iter 20 value 259.437874
## iter 30 value 234.071219
## iter 40 value 230.926099
## iter 50 value 230.186905
## iter 60 value 229.773685
## iter 70 value 229.123673
## iter 80 value 228.959685
## iter 90 value 228.804903
## iter 100 value 228.734505
## iter 110 value 228.663298
## iter 120 value 228.627456
## iter 130 value 228.595155
## iter 140 value 228.553885
## iter 150 value 228.522855
## iter 160 value 228.496380
## iter 170 value 228.455721
## iter 180 value 228.429146
## iter 190 value 228.421320
## iter 200 value 228.415367
## iter 210 value 228.412032
## iter 220 value 228.408500
## iter 230 value 228.405030
## iter 240 value 228.401535
## iter 250 value 228.395979
## iter 260 value 228.392155
## iter 270 value 228.388526
## iter 280 value 228.386922
## iter 290 value 228.385177
## iter 300 value 228.384460
## iter 310 value 228.384100
## iter 320 value 228.383679
## iter 330 value 228.383407
## final value 228.383288
## converged
# Test MSE for each network size (2, 4, ..., 20 hidden units).
nnet_MSE
## [1] 18264.38 18222.89 18219.15 18215.09 18213.65 18214.21 18213.63
## [8] 18212.47 18211.64 18210.81
# Same network-size sweep, but on the calendar/time covariates (season,
# year, month, weekday, hour) instead of the weather ones.  As above,
# the response is scaled by 1000 for the bounded output unit and the
# predictions rescaled back.  numeric(10) preallocates the result
# vector instead of growing it inside the loop.
nnet_MSE2 <- numeric(10)
for (i in seq_len(10)) {
reg_nnet2 <- nnet(registered/1000 ~ factor(season) + year + factor(month) + wday + factor(hr), data = day_train, size = (2*i), maxit = 1000, decay = 0.1)
predict2_reg <- predict(reg_nnet2, day_test, type ="raw") * 1000
nnet_MSE2[i] <- mean((predict2_reg - day_test$registered)^2)
}
## # weights: 83
## initial value 1939.542557
## iter 10 value 591.693955
## iter 20 value 279.235340
## iter 30 value 265.290538
## iter 40 value 150.407739
## iter 50 value 107.298722
## iter 60 value 102.662679
## iter 70 value 101.786570
## iter 80 value 101.507340
## iter 90 value 101.465131
## iter 100 value 97.494341
## iter 110 value 82.157325
## iter 120 value 78.314104
## iter 130 value 77.537793
## iter 140 value 77.315956
## iter 150 value 77.128635
## iter 160 value 76.783126
## iter 170 value 76.673205
## iter 180 value 76.672201
## final value 76.672191
## converged
## # weights: 165
## initial value 1302.124488
## iter 10 value 302.747572
## iter 20 value 279.323194
## iter 30 value 279.269871
## iter 40 value 277.472565
## iter 50 value 261.106766
## iter 60 value 124.485937
## iter 70 value 108.912464
## iter 80 value 105.839188
## iter 90 value 102.421934
## iter 100 value 101.357195
## iter 110 value 101.317162
## iter 120 value 101.310825
## iter 130 value 101.310070
## iter 140 value 101.293202
## iter 150 value 101.059282
## iter 160 value 100.462739
## iter 170 value 97.028932
## iter 180 value 89.297669
## iter 190 value 81.349514
## iter 200 value 77.303062
## iter 210 value 76.328034
## iter 220 value 75.909786
## iter 230 value 75.529607
## iter 240 value 75.157328
## iter 250 value 74.863718
## iter 260 value 74.695440
## iter 270 value 74.599103
## iter 280 value 74.535897
## iter 290 value 74.504359
## iter 300 value 74.495457
## iter 310 value 74.491632
## iter 320 value 74.490761
## iter 330 value 74.490450
## iter 340 value 74.490124
## iter 350 value 74.489123
## iter 360 value 74.487224
## iter 370 value 74.486294
## iter 380 value 74.485882
## iter 390 value 74.485807
## final value 74.485802
## converged
## # weights: 247
## initial value 1572.139122
## iter 10 value 464.453853
## iter 20 value 279.377047
## iter 30 value 279.339578
## iter 40 value 271.457206
## iter 50 value 145.617987
## iter 60 value 117.544149
## iter 70 value 107.856451
## iter 80 value 104.340810
## iter 90 value 103.018315
## iter 100 value 100.824406
## iter 110 value 88.131605
## iter 120 value 80.393060
## iter 130 value 78.949832
## iter 140 value 77.816632
## iter 150 value 76.930536
## iter 160 value 76.695898
## iter 170 value 76.582831
## iter 180 value 76.298330
## iter 190 value 76.044900
## iter 200 value 75.976414
## iter 210 value 75.972840
## iter 220 value 75.972486
## iter 230 value 75.972402
## iter 240 value 75.972360
## final value 75.972095
## converged
## # weights: 329
## initial value 2889.035097
## iter 10 value 305.725490
## iter 20 value 279.276063
## iter 30 value 279.273827
## iter 40 value 268.736290
## iter 50 value 126.894475
## iter 60 value 113.185060
## iter 70 value 105.422161
## iter 80 value 101.027774
## iter 90 value 96.977445
## iter 100 value 92.117469
## iter 110 value 84.747706
## iter 120 value 80.441749
## iter 130 value 78.364698
## iter 140 value 77.198279
## iter 150 value 76.876970
## iter 160 value 76.585014
## iter 170 value 75.749262
## iter 180 value 75.253763
## iter 190 value 75.027030
## iter 200 value 74.900812
## iter 210 value 74.831110
## iter 220 value 74.737075
## iter 230 value 74.599880
## iter 240 value 74.431332
## iter 250 value 74.355942
## iter 260 value 74.320633
## iter 270 value 74.299944
## iter 280 value 74.291012
## iter 290 value 74.284789
## iter 300 value 74.280283
## iter 310 value 74.277607
## iter 320 value 74.270934
## iter 330 value 74.254071
## iter 340 value 74.242653
## iter 350 value 74.234506
## iter 360 value 74.225584
## iter 370 value 74.209666
## iter 380 value 74.195302
## iter 390 value 74.183845
## iter 400 value 74.170661
## iter 410 value 74.144997
## iter 420 value 74.094548
## iter 430 value 74.056502
## iter 440 value 74.038405
## iter 450 value 74.032448
## iter 460 value 74.030241
## iter 470 value 74.028787
## iter 480 value 74.027660
## iter 490 value 74.026896
## iter 500 value 74.025457
## iter 510 value 74.024232
## iter 520 value 74.023877
## iter 530 value 74.023804
## final value 74.023795
## converged
## # weights: 411
## initial value 3642.416982
## iter 10 value 620.759980
## iter 20 value 282.187633
## iter 30 value 279.317854
## iter 40 value 279.284241
## iter 50 value 279.255793
## iter 60 value 227.849145
## iter 70 value 185.171621
## iter 80 value 154.124452
## iter 90 value 120.894107
## iter 100 value 111.954712
## iter 110 value 108.216812
## iter 120 value 98.013405
## iter 130 value 96.088570
## iter 140 value 83.754969
## iter 150 value 78.740827
## iter 160 value 78.504604
## iter 170 value 76.762121
## iter 180 value 75.933718
## iter 190 value 75.587031
## iter 200 value 75.460884
## iter 210 value 75.290267
## iter 220 value 75.038743
## iter 230 value 74.857537
## iter 240 value 74.741840
## iter 250 value 74.680820
## iter 260 value 74.575207
## iter 270 value 74.523280
## iter 280 value 74.474206
## iter 290 value 74.444631
## iter 300 value 74.428090
## iter 310 value 74.417655
## iter 320 value 74.414468
## final value 74.413924
## converged
## # weights: 493
## initial value 3918.185245
## iter 10 value 486.028782
## iter 20 value 279.315025
## iter 30 value 279.259831
## iter 40 value 266.591781
## iter 50 value 141.136492
## iter 60 value 118.826191
## iter 70 value 111.363540
## iter 80 value 110.004602
## iter 90 value 107.282691
## iter 100 value 103.667818
## iter 110 value 103.122002
## iter 120 value 102.484526
## iter 130 value 101.440767
## iter 140 value 101.268829
## iter 150 value 101.210216
## iter 160 value 101.172827
## iter 170 value 101.148679
## iter 180 value 101.092514
## iter 190 value 101.048613
## iter 200 value 100.052470
## iter 210 value 99.119723
## iter 220 value 93.278212
## iter 230 value 83.405028
## iter 240 value 78.398126
## iter 250 value 76.796786
## iter 260 value 75.850506
## iter 270 value 75.569469
## iter 280 value 75.246808
## iter 290 value 74.813042
## iter 300 value 74.552092
## iter 310 value 74.273709
## iter 320 value 74.107693
## iter 330 value 73.984957
## iter 340 value 73.925573
## iter 350 value 73.891323
## iter 360 value 73.866210
## iter 370 value 73.850151
## iter 380 value 73.819813
## iter 390 value 73.787068
## iter 400 value 73.777305
## iter 410 value 73.772663
## iter 420 value 73.770963
## iter 430 value 73.769229
## iter 440 value 73.764571
## iter 450 value 73.759515
## iter 460 value 73.757367
## iter 470 value 73.755029
## iter 480 value 73.753332
## iter 490 value 73.752648
## iter 500 value 73.751550
## iter 510 value 73.748931
## iter 520 value 73.743085
## iter 530 value 73.740460
## iter 540 value 73.739033
## iter 550 value 73.737081
## iter 560 value 73.731067
## iter 570 value 73.726703
## iter 580 value 73.725166
## iter 590 value 73.723397
## iter 600 value 73.722063
## iter 610 value 73.721044
## iter 620 value 73.719210
## iter 630 value 73.716178
## iter 640 value 73.713317
## iter 650 value 73.712415
## iter 660 value 73.711968
## iter 670 value 73.711753
## iter 680 value 73.711536
## iter 690 value 73.711237
## iter 700 value 73.710904
## iter 710 value 73.710646
## iter 720 value 73.710457
## iter 730 value 73.710293
## iter 740 value 73.709635
## iter 750 value 73.708838
## iter 760 value 73.708461
## iter 770 value 73.708280
## iter 780 value 73.708205
## iter 790 value 73.708168
## final value 73.708117
## converged
## # weights: 575
## initial value 856.830359
## iter 10 value 303.242495
## iter 20 value 280.561889
## iter 30 value 279.030245
## iter 40 value 269.517519
## iter 50 value 269.032389
## iter 60 value 120.223734
## iter 70 value 95.973419
## iter 80 value 93.673604
## iter 90 value 90.080410
## iter 100 value 77.441527
## iter 110 value 76.626343
## iter 120 value 75.967978
## iter 130 value 75.346300
## iter 140 value 74.855825
## iter 150 value 74.815169
## iter 160 value 74.640302
## iter 170 value 74.627598
## iter 180 value 74.538465
## iter 190 value 74.374392
## iter 200 value 74.287632
## iter 210 value 74.181262
## iter 220 value 74.106834
## iter 230 value 74.042370
## iter 240 value 74.007708
## iter 250 value 73.981396
## iter 260 value 73.934788
## iter 270 value 73.880248
## iter 280 value 73.829476
## iter 290 value 73.796932
## iter 300 value 73.780359
## iter 310 value 73.757690
## iter 320 value 73.729119
## iter 330 value 73.718508
## iter 340 value 73.714161
## iter 350 value 73.708977
## iter 360 value 73.700103
## iter 370 value 73.685961
## iter 380 value 73.679062
## iter 390 value 73.676457
## iter 400 value 73.675413
## iter 410 value 73.674642
## iter 420 value 73.674139
## iter 430 value 73.673629
## iter 440 value 73.673197
## iter 450 value 73.672897
## iter 460 value 73.672458
## iter 470 value 73.671863
## iter 480 value 73.671464
## iter 490 value 73.671112
## iter 500 value 73.670765
## iter 510 value 73.670351
## iter 520 value 73.669855
## iter 530 value 73.669133
## iter 540 value 73.667231
## iter 550 value 73.666515
## iter 560 value 73.666113
## iter 570 value 73.665806
## iter 580 value 73.665567
## iter 590 value 73.665396
## iter 600 value 73.665129
## iter 610 value 73.664829
## iter 620 value 73.664648
## iter 630 value 73.664594
## iter 640 value 73.664558
## iter 650 value 73.664517
## iter 660 value 73.664423
## iter 670 value 73.664304
## iter 680 value 73.664262
## iter 690 value 73.664245
## iter 700 value 73.664231
## final value 73.664222
## converged
## # weights: 657
## initial value 8213.431926
## iter 10 value 1023.744300
## iter 20 value 312.717559
## iter 30 value 279.310931
## iter 40 value 278.662042
## iter 50 value 140.773082
## iter 60 value 104.261101
## iter 70 value 102.328922
## iter 80 value 101.616002
## iter 90 value 101.325750
## iter 100 value 101.270815
## iter 110 value 101.209601
## iter 120 value 101.177212
## iter 130 value 101.168997
## iter 140 value 101.167009
## iter 150 value 101.162859
## iter 160 value 101.157822
## iter 170 value 101.143740
## iter 180 value 100.342648
## iter 190 value 90.812854
## iter 200 value 83.593017
## iter 210 value 79.710753
## iter 220 value 77.390735
## iter 230 value 77.053964
## iter 240 value 76.524590
## iter 250 value 76.233476
## iter 260 value 75.882305
## iter 270 value 75.469594
## iter 280 value 75.321457
## iter 290 value 75.275697
## iter 300 value 75.161241
## iter 310 value 74.968066
## iter 320 value 74.892062
## iter 330 value 74.733082
## iter 340 value 74.687625
## iter 350 value 74.637374
## iter 360 value 74.520189
## iter 370 value 74.442273
## iter 380 value 74.354015
## iter 390 value 74.290729
## iter 400 value 74.231759
## iter 410 value 74.161868
## iter 420 value 74.111125
## iter 430 value 74.084695
## iter 440 value 74.073395
## iter 450 value 74.068042
## iter 460 value 74.063168
## iter 470 value 74.043799
## iter 480 value 73.956295
## iter 490 value 73.883420
## iter 500 value 73.855568
## iter 510 value 73.787222
## iter 520 value 73.754731
## iter 530 value 73.732003
## iter 540 value 73.718782
## iter 550 value 73.706727
## iter 560 value 73.701852
## iter 570 value 73.700531
## iter 580 value 73.700050
## iter 590 value 73.699918
## iter 600 value 73.699849
## iter 610 value 73.699639
## iter 620 value 73.699561
## final value 73.699559
## converged
## # weights: 739
## initial value 5619.309319
## iter 10 value 432.746052
## iter 20 value 279.257763
## iter 30 value 279.255350
## iter 40 value 279.206678
## iter 50 value 278.152539
## iter 60 value 273.417339
## iter 70 value 120.729187
## iter 80 value 104.208886
## iter 90 value 102.314156
## iter 100 value 101.819122
## iter 110 value 101.700410
## iter 120 value 101.342780
## iter 130 value 101.057719
## iter 140 value 100.826763
## iter 150 value 100.196549
## iter 160 value 98.731056
## iter 170 value 94.863541
## iter 180 value 87.505562
## iter 190 value 81.468478
## iter 200 value 78.318468
## iter 210 value 76.622601
## iter 220 value 75.799288
## iter 230 value 75.434286
## iter 240 value 75.085966
## iter 250 value 74.943407
## iter 260 value 74.857755
## iter 270 value 74.809663
## iter 280 value 74.773897
## iter 290 value 74.745526
## iter 300 value 74.685286
## iter 310 value 74.627948
## iter 320 value 74.575417
## iter 330 value 74.523726
## iter 340 value 74.486104
## iter 350 value 74.448150
## iter 360 value 74.417340
## iter 370 value 74.383648
## iter 380 value 74.354355
## iter 390 value 74.332541
## iter 400 value 74.315491
## iter 410 value 74.305315
## iter 420 value 74.298362
## iter 430 value 74.290933
## iter 440 value 74.283319
## iter 450 value 74.272259
## iter 460 value 74.248703
## iter 470 value 74.221653
## iter 480 value 74.190295
## iter 490 value 74.160033
## iter 500 value 74.129850
## iter 510 value 74.092338
## iter 520 value 74.068096
## iter 530 value 74.042616
## iter 540 value 74.004763
## iter 550 value 73.965977
## iter 560 value 73.938996
## iter 570 value 73.918877
## iter 580 value 73.901707
## iter 590 value 73.887226
## iter 600 value 73.875345
## iter 610 value 73.868081
## iter 620 value 73.856506
## iter 630 value 73.842278
## iter 640 value 73.837985
## iter 650 value 73.803026
## iter 660 value 73.759257
## iter 670 value 73.742592
## iter 680 value 73.741959
## final value 73.741905
## converged
## # weights: 821
## initial value 354.911611
## iter 10 value 279.296691
## iter 20 value 279.249313
## iter 30 value 279.239328
## iter 40 value 279.214994
## iter 50 value 279.097400
## iter 60 value 278.797605
## iter 70 value 277.322989
## iter 80 value 242.279349
## iter 90 value 156.952647
## iter 100 value 128.923047
## iter 110 value 116.278606
## iter 120 value 96.627689
## iter 130 value 93.515381
## iter 140 value 83.622621
## iter 150 value 80.496865
## iter 160 value 76.975480
## iter 170 value 76.490876
## iter 180 value 76.268197
## iter 190 value 75.940281
## iter 200 value 75.818712
## iter 210 value 75.758856
## iter 220 value 75.606596
## iter 230 value 75.571121
## iter 240 value 75.490892
## iter 250 value 75.393535
## iter 260 value 75.381588
## iter 270 value 75.353841
## iter 280 value 75.306527
## iter 290 value 75.271367
## iter 300 value 75.249236
## iter 310 value 75.241054
## iter 320 value 75.194880
## iter 330 value 75.145196
## iter 340 value 75.050598
## iter 350 value 74.962015
## iter 360 value 74.934005
## iter 370 value 74.856557
## iter 380 value 74.839257
## iter 390 value 74.789212
## iter 400 value 74.632425
## iter 410 value 74.396897
## iter 420 value 74.136796
## iter 430 value 73.992135
## iter 440 value 73.919512
## iter 450 value 73.859808
## iter 460 value 73.813955
## iter 470 value 73.783726
## iter 480 value 73.752009
## iter 490 value 73.741267
## iter 500 value 73.734490
## iter 510 value 73.729677
## iter 520 value 73.727028
## iter 530 value 73.725439
## iter 540 value 73.724424
## iter 550 value 73.723630
## iter 560 value 73.723109
## iter 570 value 73.722556
## iter 580 value 73.721678
## iter 590 value 73.719909
## iter 600 value 73.715969
## iter 610 value 73.713867
## iter 620 value 73.712862
## iter 630 value 73.712342
## iter 640 value 73.712070
## iter 650 value 73.711746
## iter 660 value 73.711234
## iter 670 value 73.710307
## iter 680 value 73.709223
## iter 690 value 73.708417
## iter 700 value 73.707798
## iter 710 value 73.707240
## iter 720 value 73.706742
## iter 730 value 73.706363
## iter 740 value 73.706035
## iter 750 value 73.705557
## iter 760 value 73.705074
## iter 770 value 73.704803
## iter 780 value 73.704689
## iter 790 value 73.704607
## iter 800 value 73.704468
## iter 810 value 73.704214
## iter 820 value 73.704035
## iter 830 value 73.703750
## iter 840 value 73.703392
## iter 850 value 73.702994
## iter 860 value 73.702825
## iter 870 value 73.702663
## iter 880 value 73.702501
## iter 890 value 73.702338
## iter 900 value 73.702137
## iter 910 value 73.701954
## iter 920 value 73.701647
## iter 930 value 73.701207
## iter 940 value 73.700854
## iter 950 value 73.700541
## iter 960 value 73.700301
## iter 970 value 73.700046
## iter 980 value 73.699853
## iter 990 value 73.699601
## iter1000 value 73.699388
## final value 73.699388
## stopped after 1000 iterations
# Test-set MSEs for the networks fitted above (hidden units = 2*i, i = 1..10)
nnet_MSE2
## [1] 5504.759 5391.998 5577.465 5419.372 5498.963 5439.971 5416.667
## [8] 5423.153 5421.526 5438.779
# Fit 10 neural nets of increasing size (2, 4, ..., 20 hidden units) on a
# mix of time (year, wday) and weather (weather, atemp) predictors.
# The response is divided by 1000 so it lies roughly in [0, 1] (nnet's
# default logistic output unit saturates otherwise); predictions are
# rescaled back by 1000 before computing the test MSE.
nnet_MSE3 <- vector()
for(i in 1:10){
reg_nnet3 <- nnet(registered/1000 ~ year + wday + factor(weather) + atemp , data = day_train, size = (2*i), maxit = 1000, decay = 0.1)
predict3_reg <- predict(reg_nnet3, day_test, type ="raw") * 1000
nnet_MSE3[i] <- mean((predict3_reg - day_test$registered)^2)
}
## # weights: 15
## initial value 2676.656939
## iter 10 value 299.990070
## iter 20 value 279.561892
## iter 30 value 279.328711
## iter 40 value 279.041973
## iter 50 value 277.625684
## iter 60 value 265.331563
## iter 70 value 246.631694
## iter 80 value 244.427093
## iter 90 value 244.188558
## iter 100 value 244.139011
## iter 110 value 244.137610
## final value 244.137599
## converged
## # weights: 29
## initial value 4201.146965
## iter 10 value 571.770907
## iter 20 value 279.516002
## iter 30 value 278.913038
## iter 40 value 255.043617
## iter 50 value 245.121214
## iter 60 value 244.154609
## iter 70 value 244.138219
## iter 80 value 244.133787
## final value 244.133710
## converged
## # weights: 43
## initial value 3900.442444
## iter 10 value 403.818109
## iter 20 value 279.323836
## final value 279.323745
## converged
## # weights: 57
## initial value 2082.027220
## iter 10 value 471.384347
## iter 20 value 279.283500
## iter 30 value 279.230008
## iter 40 value 262.930911
## iter 50 value 246.153275
## iter 60 value 244.256800
## iter 70 value 244.156278
## iter 80 value 244.124262
## iter 90 value 244.123976
## iter 100 value 244.123479
## iter 110 value 244.123158
## iter 120 value 244.122488
## iter 130 value 244.122119
## iter 140 value 244.121690
## iter 150 value 244.121556
## final value 244.121543
## converged
## # weights: 71
## initial value 1873.665514
## iter 10 value 344.860743
## iter 20 value 280.430772
## iter 30 value 279.296567
## iter 40 value 279.266043
## iter 50 value 278.897444
## iter 60 value 269.745336
## iter 70 value 249.720006
## iter 80 value 244.637661
## iter 90 value 244.315855
## iter 100 value 244.164056
## iter 110 value 244.127910
## iter 120 value 244.122174
## iter 130 value 244.121995
## iter 140 value 244.121803
## iter 150 value 244.121657
## final value 244.121543
## converged
## # weights: 85
## initial value 284.014497
## iter 10 value 279.258674
## iter 20 value 279.065798
## iter 30 value 272.183273
## iter 40 value 248.121113
## iter 50 value 244.523989
## iter 60 value 244.283802
## iter 70 value 244.162299
## iter 80 value 244.152285
## iter 90 value 244.137724
## iter 100 value 244.134700
## iter 110 value 244.125928
## iter 120 value 244.120448
## iter 130 value 244.119017
## iter 140 value 244.118670
## iter 150 value 244.118454
## iter 160 value 244.118346
## final value 244.118260
## converged
## # weights: 99
## initial value 2232.907473
## iter 10 value 551.695703
## iter 20 value 279.257987
## iter 30 value 279.250800
## iter 40 value 279.169374
## iter 50 value 254.933700
## iter 60 value 245.790964
## iter 70 value 244.323619
## iter 80 value 244.176755
## iter 90 value 244.123613
## final value 244.123292
## converged
## # weights: 113
## initial value 281.821425
## iter 10 value 279.253244
## iter 20 value 279.091271
## iter 30 value 278.727077
## iter 40 value 259.414438
## iter 50 value 250.990656
## iter 60 value 246.184833
## iter 70 value 244.146826
## iter 80 value 244.138312
## iter 90 value 244.123032
## iter 100 value 244.122630
## final value 244.122558
## converged
## # weights: 127
## initial value 1288.958759
## iter 10 value 394.751135
## iter 20 value 281.576107
## iter 30 value 278.943562
## iter 40 value 257.573785
## iter 50 value 244.982233
## iter 60 value 244.475798
## iter 70 value 244.233752
## iter 80 value 244.128046
## iter 90 value 244.120746
## iter 100 value 244.120638
## iter 110 value 244.120348
## iter 120 value 244.120081
## iter 130 value 244.119759
## final value 244.119661
## converged
## # weights: 141
## initial value 7675.880627
## iter 10 value 949.855056
## iter 20 value 291.678038
## iter 30 value 279.393832
## iter 40 value 279.262601
## iter 50 value 279.190621
## iter 60 value 278.758746
## iter 70 value 268.910041
## iter 80 value 259.357362
## iter 90 value 248.272663
## iter 100 value 244.568478
## iter 110 value 244.246029
## iter 120 value 244.130022
## iter 130 value 244.120722
## iter 140 value 244.119848
## final value 244.119744
## converged
# Test-set MSEs for the mixed-variable networks; the values are nearly
# constant (~19690), suggesting the fits converged to similar solutions.
nnet_MSE3
## [1] 19690.31 19690.10 22801.62 19689.10 19689.71 19688.81 19689.67
## [8] 19689.34 19689.61 19689.63
I scale my variables, as otherwise the test output remains constant. I do this by dividing the number of registered users by 1000 to bring it into the 0-1 range, and then multiplying my predicted values by 1000 afterwards.
For the final neural network I chose the variables weekday (wday) and year as the time variables, and atemp and weather as the weather variables. From previous questions we have seen that these play an important role in predicting the total number of registered users at any given time.
Comparing our three models, we see that the smallest MSE belongs to the model which used only time variables and the largest to the model with mixed variables (the one I chose). This seems to imply that registered users are more flexible when it comes to daily changes in the weather, but plan their commute across the year so as to avoid biking in the winter.
library(leaps)
# Forward stepwise subset selection for registered riders, up to 10
# variables. 'casual' is excluded because it is the other response
# variable, not a predictor.
regfit.fwd <- regsubsets (registered ~ . - casual, data=day ,nvmax=10, method ="forward")
reg_fwd_sum <- summary(regfit.fwd)
reg_fwd_sum
## Subset selection object
## Call: regsubsets.formula(registered ~ . - casual, data = day, nvmax = 10,
## method = "forward")
## 10 Variables (and intercept)
## Forced in Forced out
## season FALSE FALSE
## year FALSE FALSE
## month FALSE FALSE
## wday FALSE FALSE
## hr FALSE FALSE
## temp FALSE FALSE
## atemp FALSE FALSE
## hum FALSE FALSE
## windspeed FALSE FALSE
## weather FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: forward
## season year month wday hr temp atemp hum windspeed weather
## 1 ( 1 ) " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" "*" " " " " " " " "
## 3 ( 1 ) " " "*" " " " " "*" "*" " " " " " " " "
## 4 ( 1 ) " " "*" " " " " "*" "*" " " "*" " " " "
## 5 ( 1 ) "*" "*" " " " " "*" "*" " " "*" " " " "
## 6 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" " " " "
## 7 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" " "
## 9 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
# Same selection run backwards, for comparison with the forward results.
regfit.bwd <- regsubsets (registered ~ . - casual , data=day ,nvmax=10, method ="backward")
reg_bwd_sum <- summary(regfit.bwd)
reg_bwd_sum
## Subset selection object
## Call: regsubsets.formula(registered ~ . - casual, data = day, nvmax = 10,
## method = "backward")
## 10 Variables (and intercept)
## Forced in Forced out
## season FALSE FALSE
## year FALSE FALSE
## month FALSE FALSE
## wday FALSE FALSE
## hr FALSE FALSE
## temp FALSE FALSE
## atemp FALSE FALSE
## hum FALSE FALSE
## windspeed FALSE FALSE
## weather FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: backward
## season year month wday hr temp atemp hum windspeed weather
## 1 ( 1 ) " " " " " " " " "*" " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" " " "*" " " " " " "
## 3 ( 1 ) " " "*" " " " " "*" " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " "*" " " "*" "*" " " " "
## 5 ( 1 ) "*" "*" " " " " "*" " " "*" "*" " " " "
## 6 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" " " " "
## 7 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" " "
## 8 ( 1 ) "*" "*" " " "*" "*" " " "*" "*" "*" "*"
## 9 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
# RSS and adjusted R^2 against model size for both selection directions.
plot(reg_fwd_sum$rss ,xlab="Number of Variables with Forward Selection",ylab="RSS", type="l")
plot(reg_fwd_sum$adjr2 ,xlab="Number of Variables with Forward Selection", ylab="Adjusted RSq",type="l")
plot(reg_bwd_sum$rss ,xlab="Number of Variables with Backward Selection",ylab="RSS", type="l")
plot(reg_bwd_sum$adjr2 ,xlab="Number of Variables with Backward Selection", ylab="Adjusted RSq",type="l")
From our plots we see that the best model has 6 variables under both forward and backward subset selection. The two 6-variable models are nearly identical: both include season, year, wday, hr and hum, with forward selection picking temp and backward selection picking the closely related atemp.
# a
library(readr)
# Load the forest cover-type training sample (read_csv fills the unnamed
# index column in as 'X1'; it is dropped further below).
covtype_train <- read_csv("covtype_train.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
# Load the matching test sample.
covtype_test <- read_csv("covtype_test.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
# Drop the index column and recode the response to 0/1 for logistic
# regression: cover type 2 -> 0, cover type 3 -> 1.
covtype_train$X1 <- NULL
covtype_test$X1 <- NULL
covtype_train$cover[covtype_train$cover == 2] <- 0
covtype_train$cover[covtype_train$cover == 3] <- 1
covtype_test$cover[covtype_test$cover == 2] <- 0
covtype_test$cover[covtype_test$cover == 3] <- 1
# Full logistic regression on all predictors. The warnings below (no
# convergence, fitted probabilities of 0 or 1) are typically a sign of
# (quasi-)separation: some dummy columns predict cover perfectly.
glm_cover <- glm(as.factor(cover) ~ ., data = covtype_train, family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_cover)
##
## Call:
## glm(formula = as.factor(cover) ~ ., family = "binomial", data = covtype_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.408 0.000 0.000 0.000 2.847
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.093e+12 4.232e+12 -2.580e-01 0.79626
## elev -1.640e-02 1.194e-03 -1.373e+01 < 2e-16 ***
## aspect 6.926e-04 1.071e-03 6.470e-01 0.51785
## slope -4.658e-02 3.672e-02 -1.269e+00 0.20462
## h_dist_hydro 5.546e-03 8.090e-04 6.856e+00 7.08e-12 ***
## v_dist_hydro 9.493e-04 1.947e-03 4.880e-01 0.62581
## h_dist_road 4.021e-04 1.349e-04 2.982e+00 0.00287 **
## hillshade_9 -6.072e-02 3.699e-02 -1.642e+00 0.10067
## hillshade_12 6.440e-02 3.171e-02 2.031e+00 0.04226 *
## hillshade_3 -6.445e-02 3.134e-02 -2.057e+00 0.03970 *
## h_dist_fire 1.038e-04 1.487e-04 6.980e-01 0.48508
## wild1 -2.384e+01 1.975e+04 -1.000e-03 0.99904
## wild2 -4.504e+15 4.115e+06 -1.095e+09 < 2e-16 ***
## wild3 -5.379e-01 3.346e-01 -1.608e+00 0.10792
## wild4 NA NA NA NA
## soil1 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil2 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil3 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil4 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil5 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil6 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil7 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil8 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil9 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil10 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil11 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil12 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil13 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil14 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil15 NA NA NA NA
## soil16 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil17 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil18 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil19 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil20 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil21 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil22 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil23 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil24 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil25 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil26 -4.503e+15 4.232e+12 -1.064e+03 < 2e-16 ***
## soil27 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil28 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil29 -4.503e+15 4.232e+12 -1.064e+03 < 2e-16 ***
## soil30 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil31 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil32 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil33 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil34 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil35 NA NA NA NA
## soil36 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil37 NA NA NA NA
## soil38 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil39 1.093e+12 4.232e+12 2.580e-01 0.79626
## soil40 1.093e+12 4.232e+12 2.580e-01 0.79626
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6992.8 on 9999 degrees of freedom
## Residual deviance: 1109.5 on 9949 degrees of freedom
## AIC: 1211.5
##
## Number of Fisher Scoring iterations: 25
# Classify with a 0.5 probability threshold; confusion matrices on the
# training and test sets. The rank-deficiency warning comes from the
# aliased columns (wild4, soil15, soil35, soil37 had NA coefficients).
cover_pred_train <- predict(glm_cover, covtype_train, type = "response")>0.5
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
table(actual = covtype_train$cover, predicted_train = cover_pred_train)
## predicted_train
## actual FALSE TRUE
## 0 8777 108
## 1 114 1001
cover_pred_test <- predict(glm_cover, covtype_test, type = "response")>0.5
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
table(actual = covtype_test$cover, predicted_test = cover_pred_test)
## predicted_test
## actual FALSE TRUE
## 0 8717 128
## 1 124 1031
# Using subset of significant predictors
# Refit using only the predictors that were significant in the full model.
glm_cover2 <- glm(as.factor(cover) ~ elev + h_dist_hydro + h_dist_road + hillshade_12 + hillshade_3 +
wild2 + soil26 + soil29, data = covtype_train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm_cover2)
##
## Call:
## glm(formula = as.factor(cover) ~ elev + h_dist_hydro + h_dist_road +
## hillshade_12 + hillshade_3 + wild2 + soil26 + soil29, family = "binomial",
## data = covtype_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2831 -0.0972 -0.0127 0.0000 3.2586
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.382e+01 1.775e+00 24.685 < 2e-16 ***
## elev -2.135e-02 7.972e-04 -26.775 < 2e-16 ***
## h_dist_hydro 5.708e-03 4.303e-04 13.263 < 2e-16 ***
## h_dist_road 6.611e-04 8.764e-05 7.543 4.58e-14 ***
## hillshade_12 5.001e-02 4.176e-03 11.975 < 2e-16 ***
## hillshade_3 -1.867e-02 2.043e-03 -9.141 < 2e-16 ***
## wild2 -8.958e+00 9.211e+02 -0.010 0.992
## soil26 -1.714e+01 1.985e+03 -0.009 0.993
## soil29 -1.658e+01 2.954e+02 -0.056 0.955
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6992.8 on 9999 degrees of freedom
## Residual deviance: 1807.5 on 9991 degrees of freedom
## AIC: 1825.5
##
## Number of Fisher Scoring iterations: 19
# A lower threshold (0.13) chosen so the two class error rates come out
# roughly balanced; confusion matrices on training and test data.
cover_pred_train2 <- predict(glm_cover2, covtype_train, type = "response")>0.13
table(actual = covtype_train$cover, predicted_train = cover_pred_train2)
## predicted_train
## actual FALSE TRUE
## 0 8332 553
## 1 73 1042
cover_pred_test2 <- predict(glm_cover2, covtype_test, type = "response")>0.13
table(actual = covtype_test$cover, predicted_test = cover_pred_test2)
## predicted_test
## actual FALSE TRUE
## 0 8238 607
## 1 69 1086
I chose a threshold level of 0.13 and ran the logistic regression again on a subset of the significant predictors. On the training data this gives roughly balanced errors: a false negative rate of 6.55% (73/1115) and a false positive rate of 6.22% (553/8885).
On the test data this same threshold gives a false negative rate of 5.97% (69/1155) and a false positive rate of 6.86% (607/8845).
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
# Downsample to decrease SVM computation amount
# downsamp <- seq(1, ncol(covtype_train), 4)
# covtype_train2 <- covtype_train[ downsamp]
# # covtype_train2$train.y <- mnist_train$train.y
# covtype_test2 <- covtype_test[ downsamp]
# # mnist_test2$test.y <- covtype_test$test.y
covtype_train2 <- covtype_train
covtype_test2 <- covtype_test
# Drop soil variables to make computation easier
# NOTE(review): the next line overwrites the copy made above, so that
# assignment is redundant; columns 15-54 are the soil dummy columns.
covtype_train2 <- covtype_train[ -c(15:54) ]
covtype_test2 <- covtype_test2[ -c(15:54) ]
# Run SVM
# Check for zero-variance columns first (SVM scaling breaks on them).
which(apply(covtype_train2, 2, var) == 0)
## named integer(0)
which(apply(covtype_test2, 2, var) == 0)
## named integer(0)
# Drop three soil dummies from the FULL data (used for the final svm()
# fit below) — these were aliased (NA coefficients) in the logistic fit,
# presumably constant or collinear in this sample; TODO confirm.
covtype_train = subset(covtype_train, select = -c(soil15,soil35,soil37) )
covtype_test = subset(covtype_test, select = -c(soil15,soil35,soil37))
# covtype_train2$cover <- covtype_train$cover
# covtype_train2$cover <- covtype_train$cover
# 10-fold CV grid search over gamma and cost on the soil-free subset.
tune_cover_radial <- tune.svm(as.factor(cover) ~ ., data = covtype_train2, kernel = "radial", gamma = c(1, 0.1, 0.01, 0.001), cost = c(1, 0.1, 0.01))
summary(tune_cover_radial)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.1 1
##
## - best performance: 0.0263
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 1.000 1.00 0.0301 0.005858517
## 2 0.100 1.00 0.0263 0.005812821
## 3 0.010 1.00 0.0335 0.006948221
## 4 0.001 1.00 0.0531 0.008912538
## 5 1.000 0.10 0.1015 0.007677529
## 6 0.100 0.10 0.0363 0.008179242
## 7 0.010 0.10 0.0531 0.008912538
## 8 0.001 0.10 0.0531 0.008912538
## 9 1.000 0.01 0.1115 0.009360081
## 10 0.100 0.01 0.0629 0.008999383
## 11 0.010 0.01 0.0531 0.008912538
## 12 0.001 0.01 0.1115 0.009360081
# Fit the radial-kernel SVM with the tuned hyperparameters (gamma = 0.1,
# cost = 1 — the best combination from tune.svm above) on the full
# training data.
cover_radial_fit <- svm(as.factor(cover) ~ ., data = covtype_train, kernel = "radial", gamma = 0.1, cost=1)
# Training Error
table(true = covtype_train$cover, predicted = cover_radial_fit$fitted)
## predicted
## true 0 1
## 0 8809 76
## 1 89 1026
# Test Error
pred_cover_radial_fit <- predict(cover_radial_fit , covtype_test)
# FIX: tabulate the test predictions against the TEST labels. The original
# used covtype_train$cover here, pairing test-set predictions with
# unrelated training rows and inflating the apparent test error.
table(true = covtype_test$cover, predicted = pred_cover_radial_fit)
## predicted
## true 0 1
## 0 7863 1022
## 1 988 127
My computer was unable to run the SVM model for the whole dataset so I dropped the soil variables as they seem to have relatively little predictive power from our logistic regression in part 5. I completed the tuning process and then ran the best SVM with the full dataset.
The misclassification rate is 1.65% for the training data, but the test data misclassification rate is 20.1%. This indicates our SVM is overfitting the data and does not perform well on untrained data.
library(tree)
## Warning: package 'tree' was built under R version 3.4.4
# attach(covtype_test)
# Cover <- ifelse(Purchase == "MM",1,0)
# Unpruned classification tree on the full training data.
tree.cover <- tree(as.factor(cover) ~ ., data = covtype_train)
summary(tree.cover)
##
## Classification tree:
## tree(formula = as.factor(cover) ~ ., data = covtype_train)
## Variables actually used in tree construction:
## [1] "elev" "wild1" "soil4" "soil2"
## Number of terminal nodes: 9
## Residual mean deviance: 0.153 = 1529 / 9991
## Misclassification error rate: 0.0248 = 248 / 10000
# Draw the tree and evaluate it on the test set.
plot(tree.cover)
text(tree.cover ,pretty =0)
tree.cover.pred <- predict(tree.cover, covtype_test, type="class")
table(actual = covtype_test$cover, predicted = tree.cover.pred)
## predicted
## actual 0 1
## 0 8741 104
## 1 191 964
# CV and Pruning
# Cross-validate tree size using misclassification error as the cost.
set.seed (3)
cv.tree.cover <- cv.tree(tree.cover ,FUN=prune.misclass )
cv.tree.cover
## $size
## [1] 9 8 6 5 3 2 1
##
## $dev
## [1] 289 289 316 371 410 698 933
##
## $k
## [1] -Inf 0.0 15.5 43.0 58.5 280.0 396.0
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# Prune to 8 terminal nodes (the smallest size attaining the minimum CV
# deviance, 289, in the cv.tree output above), then re-evaluate on test.
prune.tree.cover <- prune.misclass(tree.cover, best=8)
plot(prune.tree.cover)
text(prune.tree.cover, pretty=0)
tree.cover.pred2 <- predict(prune.tree.cover, covtype_test, type="class")
table(actual = covtype_test$cover, predicted = tree.cover.pred2)
## predicted
## actual 0 1
## 0 8741 104
## 1 191 964
Classification error rate for the pruned tree = (104 + 191) / 10000 = 2.95%
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest on all predictors, mtry = 7 (~ sqrt(p) for classification,
# see the note further below) and a small forest of 20 trees.
randForest.cover <- randomForest(as.factor(cover) ~ . , data = covtype_train, mtry= 7 , ntree=20)
randForest.cover
##
## Call:
## randomForest(formula = as.factor(cover) ~ ., data = covtype_train, mtry = 7, ntree = 20)
## Type of random forest: classification
## Number of trees: 20
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 2.07%
## Confusion matrix:
## 0 1 class.error
## 0 8785 99 0.01114363
## 1 108 1007 0.09686099
# Variable importance (mean decrease in Gini impurity).
importance(randForest.cover)
## MeanDecreaseGini
## elev 3.849620e+02
## aspect 3.927054e+01
## slope 6.127884e+01
## h_dist_hydro 3.164131e+01
## v_dist_hydro 3.047529e+01
## h_dist_road 9.649719e+01
## hillshade_9 4.431769e+01
## hillshade_12 4.679115e+01
## hillshade_3 4.207037e+01
## h_dist_fire 7.276937e+01
## wild1 7.462885e+01
## wild2 7.962385e-01
## wild3 4.575683e+01
## wild4 3.655203e+02
## soil1 7.060504e+00
## soil2 1.370932e+02
## soil3 1.649194e+01
## soil4 1.332726e+02
## soil5 5.243146e+00
## soil6 4.317663e+01
## soil7 0.000000e+00
## soil8 0.000000e+00
## soil9 0.000000e+00
## soil10 5.912911e+01
## soil11 5.972824e+00
## soil12 8.267943e-01
## soil13 7.703536e+00
## soil14 4.285099e-01
## soil16 5.239645e-03
## soil17 8.606445e-01
## soil18 7.043711e-01
## soil19 4.306418e-02
## soil20 6.493282e-02
## soil21 0.000000e+00
## soil22 3.476340e+00
## soil23 4.797876e+00
## soil24 2.642385e+00
## soil25 0.000000e+00
## soil26 3.703704e-04
## soil27 1.549107e-02
## soil28 1.026579e+00
## soil29 4.885890e+00
## soil30 7.244359e-01
## soil31 2.022117e+00
## soil32 1.000551e+01
## soil33 1.818116e+01
## soil34 9.415283e-02
## soil36 0.000000e+00
## soil38 2.539277e-04
## soil39 1.735983e-01
## soil40 0.000000e+00
# Test-set confusion matrix for the full random forest.
yhat.randC = predict(randForest.cover ,newdata = covtype_test)
table(actual = covtype_test$cover, predicted = yhat.randC)
## predicted
## actual 0 1
## 0 8732 113
## 1 109 1046
# Important 10 variable model
# Refit using only the 10 continuous terrain predictors; mtry = 3 ~ sqrt(10).
randForest.cover2 <- randomForest(as.factor(cover) ~ elev + aspect + slope + h_dist_hydro + v_dist_hydro + h_dist_road + hillshade_9 + hillshade_12 + hillshade_3 + h_dist_fire , data = covtype_train, mtry= 3 , ntree=20)
randForest.cover2
##
## Call:
## randomForest(formula = as.factor(cover) ~ elev + aspect + slope + h_dist_hydro + v_dist_hydro + h_dist_road + hillshade_9 + hillshade_12 + hillshade_3 + h_dist_fire, data = covtype_train, mtry = 3, ntree = 20)
## Type of random forest: classification
## Number of trees: 20
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 2.7%
## Confusion matrix:
## 0 1 class.error
## 0 8788 95 0.01069459
## 1 175 940 0.15695067
# Test-set confusion matrix for the reduced (10-variable) random forest.
yhat.randC2 = predict(randForest.cover2 ,newdata = covtype_test)
table(actual = covtype_test$cover, predicted = yhat.randC2)
## predicted
## actual 0 1
## 0 8752 93
## 1 187 968
Note: For a random forest classification problem we use mtry = sqrt(p).
From our test data we see that the misclassification rate is 2.95% for random forest with 10 most important variables and 2.85% for the full model random forest, implying that the latter performs better even on untrained data.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load MNIST, take a random sample of 1000 test images, and run complete-
# linkage hierarchical clustering on the Euclidean pixel distances.
load("mnist_all.RData")
df_test <- as.data.frame(test$x)
df_test$Digit <- test$y
set.seed(2)
Q9_sample <- sample(1:nrow(df_test), 1000)
df_test_q9 <- df_test[Q9_sample,]
hc.complete <- hclust(dist(df_test_q9), method="complete")
plot(hc.complete, main="Complete Linkage" , xlab="", sub="",cex =.9)
# Cut the dendrogram into 10, 5, and 15 clusters for comparison.
df_test_q9$ten_clus <- cutree(hc.complete, 10)
df_test_q9$ten_clus2 <- cutree(hc.complete, 5)
df_test_q9$ten_clus3 <- cutree(hc.complete, 15)
# Cross Tab Digit and Cluster Values
table(Digit = df_test_q9$Digit, Cluster = df_test_q9$ten_clus)
## Cluster
## Digit 1 2 3 4 5 6 7 8 9 10
## 0 15 8 0 1 0 0 42 0 0 31
## 1 54 43 0 2 0 0 0 0 0 0
## 2 12 4 12 76 0 0 1 0 2 0
## 3 5 55 23 1 1 10 8 0 6 0
## 4 51 43 0 0 12 0 0 0 0 0
## 5 27 31 5 0 1 16 3 0 1 0
## 6 14 9 0 39 1 0 0 28 0 0
## 7 62 12 0 1 45 0 1 0 0 0
## 8 38 37 1 1 0 3 1 0 5 0
## 9 40 44 0 0 15 0 1 0 0 0
# Same cross-tab with k = 5 clusters
with(df_test_q9, table(Digit = Digit, Cluster = ten_clus2))
## Cluster
## Digit 1 2 3 4 5
## 0 15 8 1 42 31
## 1 54 43 2 0 0
## 2 26 4 76 1 0
## 3 34 56 1 18 0
## 4 51 55 0 0 0
## 5 33 32 0 19 0
## 6 14 10 67 0 0
## 7 62 57 1 1 0
## 8 44 37 1 4 0
## 9 40 59 0 1 0
# Same cross-tab with k = 15 clusters
with(df_test_q9, table(Digit = Digit, Cluster = ten_clus3))
## Cluster
## Digit 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 0 4 1 0 0 1 0 7 0 7 35 0 0 0 11 31
## 1 54 43 0 1 1 0 0 0 0 0 0 0 0 0 0
## 2 12 4 12 40 20 0 0 0 0 1 16 0 2 0 0
## 3 5 55 23 1 0 1 0 10 0 8 0 0 6 0 0
## 4 51 16 0 0 0 12 27 0 0 0 0 0 0 0 0
## 5 22 30 5 0 0 1 1 16 0 3 0 0 1 5 0
## 6 12 3 0 0 39 1 6 0 0 0 0 28 0 2 0
## 7 62 11 0 0 0 45 1 0 0 1 1 0 0 0 0
## 8 38 34 1 1 0 0 3 3 0 1 0 0 5 0 0
## 9 40 34 0 0 0 15 10 0 0 1 0 0 0 0 0
# Split dataset by cluster
# split(df_test_q9, df_test_q9$ten_clus)
# Per-cluster mean of every column. summarise_all() puts the grouping column
# first, so the result has one row per cluster with columns:
# ten_clus, V1..V784 (pixel means), Digit, ten_clus2, ten_clus3.
# (The original preallocated a 10x788 matrix here, which was immediately
# overwritten by the pipeline result — dead code, removed.)
cluster_means <- df_test_q9 %>%
group_by(ten_clus) %>%
summarise_all("mean")
cluster_means <- as.data.frame(cluster_means)
# Create function to plot digits
# Plot the mean image of cluster j as a 28x28 grayscale digit.
# j:     row index into `means` (one row per cluster)
# means: data frame of per-cluster column means; first column is the cluster
#        id (ten_clus), columns 2:785 are the pixel means V1..V784.
plot_digit <- function(j, means = cluster_means){
# Columns 2:785 hold the 784 pixel means; the original 1:784 indexing
# accidentally included the ten_clus id column and dropped pixel V784.
arr784 <- as.numeric(means[j, 2:785])
greys <- gray(12:1/12)
# [, 28:1] flips the columns so the digit is drawn right side up.
image(matrix(arr784, nrow = 28)[, 28:1], col = greys,
# `means$labels` does not exist (titles printed NA); use the cluster id.
main = paste("this is from cluster ", means$ten_clus[j]))
}
# Render the mean digit image for each of the 10 clusters.
for (cluster_idx in seq_len(10)) {
plot_digit(cluster_idx)
}
(b). We can cut the dendrogram at a height of around 3500 and then we would have approximately 10 clusters, which is what we know our data has. However, if we look at the leaves we see that the digits within these clusters seem to be overlapping, and they do not seem to be separated correctly if we were to add a cut at height = 3500. In this case, even if we have ten clusters there will be a large number of misclassifications. For this reason I feel that the dendrogram does not provide compelling evidence about the correct number of “clusters”.
(c). I used a cross-tabulation to see the most common digit for each cluster and to see which digits tend to clump together. Using a hierarchical clustering with k = 10, we can see some clusters contain only one digit (such as digits 0 and 6), but no digit is perfectly separated into its own cluster. Varying the level of k still leads to imperfect clustering, with cluster 1 containing the most variation in terms of digits.
I chose to use 10 clusters and then split the data by each cluster. After this I calculated the mean of each variable by cluster to see the average value of that pixel for the cluster before plotting it.
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 3.4.4
# Identify zero-variance (always-blank) pixel columns; these must be dropped
# before prcomp(scale = TRUE), which cannot scale a constant column.
# vapply over columns avoids apply()'s coercion of the data frame to a matrix.
which(vapply(df_test, var, numeric(1)) == 0)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## V31 V32 V33 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59 V60 V61
## 31 32 33 50 51 52 53 54 55 56 57 58 59 60 61
## V82 V83 V84 V85 V86 V87 V88 V112 V113 V114 V115 V140 V141 V168 V169
## 82 83 84 85 86 87 88 112 113 114 115 140 141 168 169
## V170 V197 V225 V337 V364 V365 V392 V393 V394 V420 V421 V422 V449 V477 V478
## 170 197 225 337 364 365 392 393 394 420 421 422 449 477 478
## V505 V533 V561 V588 V589 V590 V616 V617 V618 V644 V645 V672 V673 V674 V675
## 505 533 561 588 589 590 616 617 618 644 645 672 673 674 675
## V699 V700 V701 V702 V703 V727 V728 V729 V730 V731 V732 V754 V755 V756 V757
## 699 700 701 702 703 727 728 729 730 731 732 754 755 756 757
## V758 V759 V760 V761 V762 V779 V780 V781 V782 V783 V784
## 758 759 760 761 762 779 780 781 782 783 784
# Drop the zero-variance pixel columns (vapply over columns avoids apply()'s
# coercion of the data frame to a matrix; the original as.numeric() wrapper
# around which() was redundant for negative indexing).
zero_var <- which(vapply(df_test, var, numeric(1)) == 0)
df_test <- df_test[-zero_var]
# a
# PCA on the standardized pixel columns.
# NOTE(review): df_test still contains the Digit label column here, so the
# PCA includes the response as a feature — confirm this is intended.
pr.mnist <- prcomp(df_test, scale = TRUE)
# Proportion of variance explained by each principal component.
pr.var <- pr.mnist$sdev^2
pve <- pr.var / sum(pr.var)
pve
## [1] 6.199824e-02 4.261971e-02 4.041900e-02 3.226216e-02 2.771465e-02
## [6] 2.408527e-02 2.044219e-02 1.880442e-02 1.670122e-02 1.515360e-02
## [11] 1.470044e-02 1.310275e-02 1.253967e-02 1.192863e-02 1.143425e-02
## [16] 1.093475e-02 1.031490e-02 9.927439e-03 9.457467e-03 9.102562e-03
## [21] 8.882116e-03 8.659025e-03 8.467488e-03 8.244449e-03 7.854211e-03
## [26] 7.708808e-03 7.557945e-03 7.325687e-03 7.078694e-03 6.824527e-03
## [31] 6.677280e-03 6.532530e-03 6.462752e-03 6.212479e-03 6.034448e-03
## [36] 5.917179e-03 5.812251e-03 5.723904e-03 5.648266e-03 5.521146e-03
## [41] 5.510552e-03 5.472601e-03 5.331640e-03 5.191785e-03 5.125071e-03
## [46] 4.986125e-03 4.978993e-03 4.884685e-03 4.781582e-03 4.691985e-03
## [51] 4.607179e-03 4.538054e-03 4.452305e-03 4.396033e-03 4.333757e-03
## [56] 4.231124e-03 4.189762e-03 4.137308e-03 4.110474e-03 4.050560e-03
## [61] 4.001875e-03 3.859565e-03 3.812457e-03 3.748760e-03 3.659602e-03
## [66] 3.605534e-03 3.567652e-03 3.499439e-03 3.464697e-03 3.436632e-03
## [71] 3.353172e-03 3.333730e-03 3.297193e-03 3.265675e-03 3.236521e-03
## [76] 3.184420e-03 3.141815e-03 3.089541e-03 3.075042e-03 3.012812e-03
## [81] 2.976570e-03 2.908319e-03 2.870665e-03 2.843684e-03 2.808937e-03
## [86] 2.748883e-03 2.720344e-03 2.689560e-03 2.664859e-03 2.630816e-03
## [91] 2.585594e-03 2.541446e-03 2.519489e-03 2.479986e-03 2.440511e-03
## [96] 2.427887e-03 2.378737e-03 2.346959e-03 2.313464e-03 2.306443e-03
## [101] 2.281961e-03 2.263603e-03 2.228423e-03 2.213308e-03 2.156639e-03
## [106] 2.134596e-03 2.118705e-03 2.083810e-03 2.076818e-03 2.044565e-03
## [111] 2.042761e-03 2.025927e-03 1.968989e-03 1.958587e-03 1.941742e-03
## [116] 1.926854e-03 1.895644e-03 1.878429e-03 1.823603e-03 1.787200e-03
## [121] 1.776795e-03 1.724767e-03 1.714638e-03 1.682682e-03 1.668393e-03
## [126] 1.656691e-03 1.645439e-03 1.627467e-03 1.601676e-03 1.595241e-03
## [131] 1.584143e-03 1.561710e-03 1.553411e-03 1.533416e-03 1.522532e-03
## [136] 1.499068e-03 1.493024e-03 1.491014e-03 1.468708e-03 1.460821e-03
## [141] 1.450795e-03 1.436661e-03 1.409213e-03 1.394010e-03 1.377586e-03
## [146] 1.375125e-03 1.360772e-03 1.354752e-03 1.337095e-03 1.320273e-03
## [151] 1.302467e-03 1.295328e-03 1.290134e-03 1.281180e-03 1.270603e-03
## [156] 1.250429e-03 1.237924e-03 1.217015e-03 1.205446e-03 1.179075e-03
## [161] 1.170610e-03 1.160620e-03 1.151563e-03 1.134708e-03 1.124373e-03
## [166] 1.118873e-03 1.109930e-03 1.083950e-03 1.076432e-03 1.073541e-03
## [171] 1.063906e-03 1.057469e-03 1.034596e-03 1.022580e-03 1.012353e-03
## [176] 9.893284e-04 9.825657e-04 9.768988e-04 9.694012e-04 9.667898e-04
## [181] 9.522660e-04 9.419396e-04 9.353950e-04 9.248052e-04 9.213565e-04
## [186] 9.157441e-04 9.018511e-04 8.941395e-04 8.805470e-04 8.751303e-04
## [191] 8.679617e-04 8.528464e-04 8.480460e-04 8.445352e-04 8.331361e-04
## [196] 8.283672e-04 8.178630e-04 8.082486e-04 7.940563e-04 7.859470e-04
## [201] 7.844300e-04 7.790344e-04 7.681772e-04 7.600104e-04 7.572933e-04
## [206] 7.483803e-04 7.361123e-04 7.314506e-04 7.261947e-04 7.175517e-04
## [211] 7.108677e-04 7.023430e-04 6.977069e-04 6.837937e-04 6.797393e-04
## [216] 6.767524e-04 6.689022e-04 6.657317e-04 6.578015e-04 6.466329e-04
## [221] 6.401814e-04 6.395852e-04 6.332832e-04 6.283632e-04 6.227158e-04
## [226] 6.209587e-04 6.084430e-04 6.052650e-04 5.983929e-04 5.966517e-04
## [231] 5.905252e-04 5.859358e-04 5.787889e-04 5.751876e-04 5.729030e-04
## [236] 5.692548e-04 5.601560e-04 5.513799e-04 5.464380e-04 5.398128e-04
## [241] 5.364946e-04 5.329697e-04 5.310695e-04 5.244791e-04 5.220754e-04
## [246] 5.164440e-04 5.132614e-04 5.057191e-04 5.009332e-04 4.964596e-04
## [251] 4.926785e-04 4.851165e-04 4.827462e-04 4.785159e-04 4.698508e-04
## [256] 4.658311e-04 4.612191e-04 4.609344e-04 4.568334e-04 4.539229e-04
## [261] 4.520027e-04 4.478689e-04 4.425840e-04 4.393612e-04 4.361667e-04
## [266] 4.314865e-04 4.267199e-04 4.235610e-04 4.195761e-04 4.163677e-04
## [271] 4.133548e-04 4.070288e-04 4.049531e-04 4.015810e-04 3.981422e-04
## [276] 3.960722e-04 3.924997e-04 3.909682e-04 3.889275e-04 3.848352e-04
## [281] 3.817982e-04 3.794734e-04 3.729054e-04 3.676626e-04 3.666450e-04
## [286] 3.635195e-04 3.587359e-04 3.575298e-04 3.548252e-04 3.518989e-04
## [291] 3.478298e-04 3.438942e-04 3.419607e-04 3.392417e-04 3.389817e-04
## [296] 3.371644e-04 3.337319e-04 3.313944e-04 3.261888e-04 3.236395e-04
## [301] 3.212991e-04 3.186003e-04 3.141867e-04 3.128938e-04 3.116658e-04
## [306] 3.076821e-04 3.051521e-04 3.027696e-04 3.026339e-04 3.002148e-04
## [311] 2.975710e-04 2.925877e-04 2.911952e-04 2.893052e-04 2.871546e-04
## [316] 2.856044e-04 2.843520e-04 2.815630e-04 2.795801e-04 2.786229e-04
## [321] 2.756490e-04 2.735090e-04 2.698462e-04 2.679113e-04 2.673264e-04
## [326] 2.652687e-04 2.637343e-04 2.604485e-04 2.579550e-04 2.557514e-04
## [331] 2.542842e-04 2.507213e-04 2.488205e-04 2.464813e-04 2.444966e-04
## [336] 2.428019e-04 2.408209e-04 2.383759e-04 2.371553e-04 2.351999e-04
## [341] 2.333574e-04 2.319711e-04 2.312510e-04 2.276413e-04 2.258016e-04
## [346] 2.253496e-04 2.238731e-04 2.226528e-04 2.207859e-04 2.187684e-04
## [351] 2.170464e-04 2.164116e-04 2.148670e-04 2.131621e-04 2.125376e-04
## [356] 2.103897e-04 2.091137e-04 2.084667e-04 2.059472e-04 2.052837e-04
## [361] 2.030333e-04 2.019322e-04 1.991480e-04 1.985217e-04 1.972981e-04
## [366] 1.968001e-04 1.962477e-04 1.951256e-04 1.942610e-04 1.919671e-04
## [371] 1.901847e-04 1.896575e-04 1.879231e-04 1.873754e-04 1.860726e-04
## [376] 1.836784e-04 1.827440e-04 1.803624e-04 1.797033e-04 1.788469e-04
## [381] 1.780505e-04 1.774885e-04 1.765747e-04 1.740464e-04 1.729008e-04
## [386] 1.720882e-04 1.706243e-04 1.688837e-04 1.683369e-04 1.673550e-04
## [391] 1.663364e-04 1.644107e-04 1.638130e-04 1.628263e-04 1.620527e-04
## [396] 1.612499e-04 1.599876e-04 1.594671e-04 1.580445e-04 1.566543e-04
## [401] 1.560032e-04 1.554623e-04 1.546523e-04 1.541701e-04 1.519165e-04
## [406] 1.516484e-04 1.502332e-04 1.501497e-04 1.488084e-04 1.468952e-04
## [411] 1.457049e-04 1.452348e-04 1.446686e-04 1.432294e-04 1.425745e-04
## [416] 1.423528e-04 1.412281e-04 1.404112e-04 1.398222e-04 1.384594e-04
## [421] 1.383049e-04 1.367253e-04 1.365165e-04 1.354788e-04 1.346013e-04
## [426] 1.336797e-04 1.327818e-04 1.323492e-04 1.316456e-04 1.310451e-04
## [431] 1.307308e-04 1.292952e-04 1.287709e-04 1.279019e-04 1.274838e-04
## [436] 1.260758e-04 1.253907e-04 1.243856e-04 1.238101e-04 1.235593e-04
## [441] 1.229834e-04 1.226126e-04 1.215330e-04 1.207261e-04 1.201679e-04
## [446] 1.196728e-04 1.194362e-04 1.184591e-04 1.177226e-04 1.170609e-04
## [451] 1.158616e-04 1.154538e-04 1.146241e-04 1.135415e-04 1.132472e-04
## [456] 1.127118e-04 1.120691e-04 1.115591e-04 1.109031e-04 1.098425e-04
## [461] 1.091245e-04 1.089466e-04 1.083442e-04 1.078045e-04 1.074078e-04
## [466] 1.070459e-04 1.066884e-04 1.057040e-04 1.054539e-04 1.045120e-04
## [471] 1.036630e-04 1.032192e-04 1.027397e-04 1.021103e-04 1.017783e-04
## [476] 1.006155e-04 9.980013e-05 9.977945e-05 9.893263e-05 9.857210e-05
## [481] 9.761377e-05 9.723756e-05 9.657834e-05 9.623146e-05 9.564770e-05
## [486] 9.528142e-05 9.441901e-05 9.370821e-05 9.315463e-05 9.253826e-05
## [491] 9.240993e-05 9.200704e-05 9.189691e-05 9.087484e-05 9.002478e-05
## [496] 8.995657e-05 8.933572e-05 8.910549e-05 8.820004e-05 8.775689e-05
## [501] 8.743403e-05 8.737725e-05 8.700623e-05 8.612892e-05 8.542730e-05
## [506] 8.515577e-05 8.432578e-05 8.414354e-05 8.388064e-05 8.337563e-05
## [511] 8.260967e-05 8.197271e-05 8.148338e-05 8.115102e-05 8.101425e-05
## [516] 8.022001e-05 7.996207e-05 7.924425e-05 7.903869e-05 7.794296e-05
## [521] 7.785353e-05 7.665284e-05 7.638945e-05 7.616298e-05 7.598042e-05
## [526] 7.536193e-05 7.514461e-05 7.479517e-05 7.378056e-05 7.341338e-05
## [531] 7.322906e-05 7.284149e-05 7.216421e-05 7.182428e-05 7.148795e-05
## [536] 7.107164e-05 7.061075e-05 7.014635e-05 6.965115e-05 6.959718e-05
## [541] 6.933013e-05 6.921734e-05 6.851377e-05 6.839150e-05 6.775969e-05
## [546] 6.700042e-05 6.639283e-05 6.618965e-05 6.563734e-05 6.535309e-05
## [551] 6.504091e-05 6.446289e-05 6.416963e-05 6.390340e-05 6.316640e-05
## [556] 6.287200e-05 6.265388e-05 6.259333e-05 6.233603e-05 6.201612e-05
## [561] 6.126866e-05 6.036222e-05 6.017145e-05 5.993738e-05 5.965068e-05
## [566] 5.930204e-05 5.889339e-05 5.862538e-05 5.852972e-05 5.781498e-05
## [571] 5.754479e-05 5.732929e-05 5.632986e-05 5.621896e-05 5.592795e-05
## [576] 5.550705e-05 5.483720e-05 5.457221e-05 5.430270e-05 5.420401e-05
## [581] 5.393409e-05 5.364745e-05 5.318881e-05 5.276162e-05 5.235915e-05
## [586] 5.226991e-05 5.180868e-05 5.153274e-05 5.078186e-05 5.062402e-05
## [591] 5.018914e-05 4.997664e-05 4.953201e-05 4.910308e-05 4.901593e-05
## [596] 4.853694e-05 4.814239e-05 4.779870e-05 4.757900e-05 4.720262e-05
## [601] 4.702191e-05 4.648818e-05 4.624978e-05 4.597969e-05 4.540359e-05
## [606] 4.522464e-05 4.482383e-05 4.478329e-05 4.406685e-05 4.378868e-05
## [611] 4.363170e-05 4.338326e-05 4.287022e-05 4.274361e-05 4.231431e-05
## [616] 4.182496e-05 4.154823e-05 4.126938e-05 4.108689e-05 4.090979e-05
## [621] 4.035492e-05 4.000912e-05 3.956083e-05 3.922105e-05 3.900474e-05
## [626] 3.893287e-05 3.876845e-05 3.859532e-05 3.800569e-05 3.737744e-05
## [631] 3.724173e-05 3.701784e-05 3.662021e-05 3.649974e-05 3.566638e-05
## [636] 3.556832e-05 3.519172e-05 3.484690e-05 3.418231e-05 3.402460e-05
## [641] 3.354631e-05 3.303341e-05 3.285406e-05 3.245058e-05 3.198546e-05
## [646] 3.187752e-05 3.164642e-05 3.119449e-05 3.025397e-05 3.015705e-05
## [651] 2.943917e-05 2.875357e-05 2.803551e-05 2.764070e-05 2.722639e-05
## [656] 2.651587e-05 2.411464e-05 2.368025e-05 2.259321e-05 1.632515e-05
## [661] 2.418898e-06 7.977559e-07 1.749557e-29 1.281145e-29 6.932915e-32
## [666] 2.079024e-32 2.168930e-33 1.074094e-33 1.348970e-34
# Scree plot: proportion of variance explained by each principal component.
plot(
pve,
xlab = "Principal Component",
ylab = "Proportion of Variance Explained ",
ylim = c(0, 1),
type = "b"
)
# Cumulative PVE for the first 2 and the first 10 components.
sum(pve[1:2])
## [1] 0.1046179
sum(pve[1:10])
## [1] 0.3002005
# b
# Project all test digits onto the first two PCs, coloured by true digit.
df_test$colour <- as.factor(df_test$Digit)
# NOTE(review): the "white" points (digit 9) are invisible on the default
# white plot background — consider a visible colour.
autoplot(pr.mnist, data = df_test , colour = 'colour') + scale_color_manual(labels = c("0", "1", "2", "3","4", "5", "6", "7", "8", "9"), values = c("blue", "red", "yellow", "pink", "orange", "gray", "black", "green", "purple", "white" ))
# c
# Scores on the first two PCs, restricted to digits 1, 2, and 4.
df_10c <- as.data.frame(pr.mnist$x[,1:2])
df_10c$Digit <- as.factor(df_test$Digit)
# %in% is clearer than chained == comparisons against a set
df_10c <- subset(df_10c, Digit %in% c(1, 2, 4))
ggplot(data = df_10c, aes(PC1, PC2, colour = Digit)) + geom_point()
# d
# Scores on the first two PCs, restricted to digits 3, 5, and 6.
df_10d <- as.data.frame(pr.mnist$x[,1:2])
df_10d$Digit <- as.factor(df_test$Digit)
# %in% is clearer than chained == comparisons against a set
df_10d <- subset(df_10d, Digit %in% c(5, 3, 6))
ggplot(data = df_10d, aes(PC1, PC2, colour = Digit)) + geom_point()
(a). The first 2 Principal Components explain 10.5% of the total variation, whereas the first 10 Principal Components explain 30.0% of the variation in the data.